In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import load_model
In [2]:
#Loading the Historical fires data of Oregon state into a pandas dataframe
historical_fires = pd.read_csv('./data/historical_oregon_fires.csv', low_memory=False)
In [3]:
historical_fires.shape
Out[3]:
(64053, 89)
In [5]:
historical_fires.describe().T
Out[5]:
count mean std min 25% 50% 75% max
X 64053.0 7.703456e+05 5.145695e+05 0.0000 492431.605971 648903.492782 1.036540e+06 2.330063e+06
Y 64053.0 6.554200e+05 4.872065e+05 0.0000 236353.876969 575697.853675 1.076593e+06 1.656678e+06
OBJECTID 64053.0 3.202700e+04 1.849065e+04 1.0000 16014.000000 32027.000000 4.804000e+04 6.405300e+04
Serial 64053.0 4.987656e+04 2.812513e+04 10.0000 29500.000000 45519.000000 6.766000e+04 1.155570e+05
FireYear 64053.0 1.988937e+03 1.686442e+01 1960.0000 1974.000000 1989.000000 2.003000e+03 2.019000e+03
FiscalYear 64053.0 1.989727e+03 1.685892e+01 1960.0000 1975.000000 1990.000000 2.004000e+03 2.020000e+03
FireArea 0.0 NaN NaN NaN NaN NaN NaN NaN
Current_District 64053.0 7.864742e+01 1.496438e+01 51.0000 71.000000 73.000000 9.500000e+01 9.900000e+01
Current_Unit 64053.0 7.883178e+02 1.500064e+02 511.0000 711.000000 732.000000 9.540000e+02 9.910000e+02
FireCategory 64053.0 1.000000e+00 0.000000e+00 1.0000 1.000000 1.000000 1.000000e+00 1.000000e+00
NumberPersonnel 8742.0 6.891329e+00 1.379872e+01 0.0000 2.000000 4.000000 8.000000e+00 6.000000e+02
Sec 64017.0 1.656105e+01 1.145511e+01 0.0000 6.000000 16.000000 2.700000e+01 3.600000e+01
Longitude 56852.0 -1.221674e+02 1.760223e+00 -124.5570 -123.385000 -122.809000 -1.214570e+02 -1.165210e+02
Latitude 56853.0 4.374980e+01 1.247257e+00 41.8808 42.527600 43.612800 4.485010e+01 4.624220e+01
Protection_agency 63855.0 1.112442e+00 7.146813e-01 0.0000 1.000000 1.000000 1.000000e+00 9.000000e+00
Federal_lands 52339.0 1.137832e+00 4.811332e-01 1.0000 1.000000 1.000000 1.000000e+00 3.000000e+00
Land_Class 63851.0 1.781476e+00 1.830610e+00 0.0000 1.000000 1.000000 1.000000e+00 1.100000e+01
Land_Class_tg 28852.0 1.135103e+00 3.418398e-01 1.0000 1.000000 1.000000 1.000000e+00 2.000000e+00
Minimum 64053.0 1.129689e-01 3.165573e-01 0.0000 0.000000 0.000000 0.000000e+00 1.000000e+00
Dual 64053.0 1.093626e-01 3.120960e-01 0.0000 0.000000 0.000000 0.000000e+00 1.000000e+00
County 64035.0 1.643871e+01 8.441621e+00 1.0000 10.000000 17.000000 2.000000e+01 9.900000e+01
FO_Land_Owner 63964.0 1.828471e+01 1.595216e+01 0.0000 10.000000 10.000000 2.000000e+01 9.000000e+01
SurchargeLot 64053.0 9.033144e-02 2.866583e-01 0.0000 0.000000 0.000000 0.000000e+00 1.000000e+00
SurchargeLotAssessed 64053.0 5.781150e-02 2.333885e-01 0.0000 0.000000 0.000000 0.000000e+00 1.000000e+00
SB_360_Lot 12191.0 6.353868e-01 8.680971e-01 0.0000 0.000000 0.000000 2.000000e+00 2.000000e+00
SB_360_Liable 1379.0 3.930384e-01 4.886024e-01 0.0000 0.000000 0.000000 1.000000e+00 1.000000e+00
Discovered_By 63840.0 8.514474e+00 1.715136e+01 1.0000 3.000000 6.000000 8.000000e+00 9.900000e+01
Caused_by 63854.0 5.707144e+00 4.607953e+00 0.0000 1.000000 5.000000 9.000000e+00 5.800000e+01
General 64049.0 4.091321e+00 2.651601e+00 0.0000 1.000000 4.000000 6.000000e+00 1.000000e+01
Specific 63853.0 4.301206e+02 2.771839e+02 0.0000 101.000000 402.000000 6.200000e+02 9.990000e+02
... ... ... ... ... ... ... ... ...
Notif_number 1871.0 2.549896e+07 1.886472e+08 0.0000 389.500000 30094.000000 6.163350e+04 1.871208e+09
Walk_In_Delay 12183.0 9.069605e+00 4.201296e+02 0.0000 0.000000 0.000000 0.000000e+00 3.276700e+04
DS_Cost 63822.0 9.738210e+02 2.238572e+04 0.0000 43.000000 139.000000 4.612355e+02 4.098516e+06
ES_Cost 63820.0 1.083201e+04 3.282855e+05 0.0000 0.000000 0.000000 0.000000e+00 3.697226e+07
PC_Cost 63817.0 3.099745e+02 8.056938e+03 0.0000 0.000000 0.000000 0.000000e+00 9.077680e+05
OA_Cost 63817.0 6.464702e+03 2.737810e+05 0.0000 0.000000 0.000000 3.300000e+01 3.000000e+07
TotalCost 0.0 NaN NaN NaN NaN NaN NaN NaN
Est_Damage 57886.0 5.517955e+03 2.733955e+05 -478.0000 0.000000 0.000000 0.000000e+00 4.443510e+07
CostRecovery 64053.0 1.842224e-03 4.288192e-02 0.0000 0.000000 0.000000 0.000000e+00 1.000000e+00
FEMA 64053.0 4.744508e-02 2.125906e-01 0.0000 0.000000 0.000000 0.000000e+00 1.000000e+00
AttackBy 63838.0 3.422899e+00 1.078328e+01 1.0000 1.000000 1.000000 2.000000e+00 9.900000e+01
AttackType 63832.0 2.638395e+00 1.537800e+00 0.0000 1.000000 3.000000 4.000000e+00 9.000000e+00
Flame_length 16760.0 1.400477e+00 8.673592e-01 1.0000 1.000000 1.000000 2.000000e+00 6.000000e+00
Size_at_attack 29036.0 5.499523e+00 3.296863e+02 0.0000 0.010000 0.100000 2.500000e-01 5.081500e+04
Behavior 28810.0 2.110760e+00 1.445836e+00 1.0000 1.000000 2.000000 2.000000e+00 9.000000e+00
Fuel 63844.0 2.375573e+01 3.722856e+01 1.0000 3.000000 7.000000 1.200000e+01 9.900000e+01
Topography 64053.0 3.013286e-01 4.588387e-01 0.0000 0.000000 0.000000 1.000000e+00 1.000000e+00
Aspect 64053.0 3.016252e-01 4.589670e-01 0.0000 0.000000 0.000000 1.000000e+00 1.000000e+00
Slope 64053.0 3.016096e-01 4.589603e-01 0.0000 0.000000 0.000000 1.000000e+00 1.000000e+00
Elevation 64053.0 3.011569e-01 4.587643e-01 0.0000 0.000000 0.000000 1.000000e+00 1.000000e+00
Size_class 64052.0 1.371792e+00 7.271433e-01 1.0000 1.000000 1.000000 2.000000e+00 7.000000e+00
Size_acres 63851.0 8.138700e+01 3.348425e+03 0.0000 0.050000 0.100000 5.000000e-01 4.999450e+05
Size_prot 64046.0 2.029059e+01 4.610015e+02 0.0000 0.020000 0.100000 5.000000e-01 4.336800e+04
Homes_Saved 12212.0 1.371029e+00 2.253093e+01 0.0000 0.000000 0.000000 0.000000e+00 1.221000e+03
Homes_Lost 12212.0 2.030789e-02 4.760210e-01 0.0000 0.000000 0.000000 0.000000e+00 3.500000e+01
Structures_Saved 12212.0 1.378235e+00 2.185595e+01 0.0000 0.000000 0.000000 0.000000e+00 1.000000e+03
Structures_Lost 12212.0 5.707501e-02 9.832442e-01 0.0000 0.000000 0.000000 0.000000e+00 6.000000e+01
Number_of_Injuries 12212.0 7.443498e-02 6.343492e+00 0.0000 0.000000 0.000000 0.000000e+00 7.000000e+02
Number_of_Deaths 12212.0 1.719620e-03 7.293910e-02 0.0000 0.000000 0.000000 0.000000e+00 7.000000e+00
CauseType 64053.0 1.430222e-01 6.179719e-01 0.0000 0.000000 0.000000 0.000000e+00 3.000000e+00

66 rows × 8 columns

In [7]:
historical_fires.corr()
Out[7]:
X Y OBJECTID Serial FireYear FiscalYear FireArea Current_District Current_Unit FireCategory ... Size_class Size_acres Size_prot Homes_Saved Homes_Lost Structures_Saved Structures_Lost Number_of_Injuries Number_of_Deaths CauseType
X 1.000000 0.569673 -0.046110 -0.013913 0.270544 0.271425 NaN 0.582209 0.582790 NaN ... 0.041224 0.014888 0.014578 0.018966 0.013644 0.038826 0.032316 -0.003633 -0.007731 -0.004456
Y 0.569673 1.000000 -0.075587 -0.029982 0.249519 0.250744 NaN -0.035758 -0.034135 NaN ... 0.033210 0.001582 -0.001402 -0.000656 -0.013638 0.011960 0.002634 -0.005990 0.003807 0.012365
OBJECTID -0.046110 -0.075587 1.000000 0.975739 0.387223 0.386539 NaN 0.061891 0.061911 NaN ... 0.011245 0.021108 0.020731 0.009734 0.020665 0.015485 0.021889 -0.003186 0.002646 0.377476
Serial -0.013913 -0.029982 0.975739 1.000000 0.512250 0.511511 NaN 0.025019 0.025053 NaN ... 0.014625 0.022499 0.022293 0.010475 0.022059 0.017169 0.023424 -0.003854 0.002525 0.488493
FireYear 0.270544 0.249519 0.387223 0.512250 1.000000 0.999701 NaN -0.035637 -0.035524 NaN ... -0.001244 0.022978 0.016213 0.009203 0.021398 0.015568 0.021995 -0.003787 0.001735 0.389611
FiscalYear 0.271425 0.250744 0.386539 0.511511 0.999701 1.000000 NaN -0.035843 -0.035731 NaN ... -0.000912 0.023272 0.016663 0.011074 0.023118 0.017463 0.024769 -0.003147 0.002669 0.388431
FireArea NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Current_District 0.582209 -0.035758 0.061891 0.025019 -0.035637 -0.035843 NaN 1.000000 0.999978 NaN ... 0.021834 0.010784 0.018210 0.030827 0.008614 0.038056 0.023616 -0.002401 -0.012425 -0.044790
Current_Unit 0.582790 -0.034135 0.061911 0.025053 -0.035524 -0.035731 NaN 0.999978 1.000000 NaN ... 0.022103 0.010873 0.018298 0.031090 0.008595 0.038264 0.023702 -0.002380 -0.012483 -0.044747
FireCategory NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
NumberPersonnel 0.050067 -0.011351 0.038979 0.039914 0.038630 0.042701 NaN 0.048268 0.048037 NaN ... 0.456536 0.106450 0.174076 0.486013 NaN 0.726966 0.259737 -0.003466 NaN NaN
Sec 0.276743 0.241743 -0.068289 -0.018383 0.275254 0.275100 NaN -0.001605 -0.001552 NaN ... -0.016015 0.007551 0.000098 0.017810 0.015763 0.018883 0.014290 -0.005701 0.009145 0.037712
Longitude 0.999621 0.403465 0.029330 0.005394 -0.020575 -0.019295 NaN 0.735470 0.736070 NaN ... 0.068680 0.014185 0.023342 0.019023 0.013894 0.038717 0.032586 -0.003546 -0.008113 -0.055553
Latitude 0.430500 0.999848 -0.014095 -0.015591 -0.009079 -0.007390 NaN -0.033575 -0.031697 NaN ... 0.054684 -0.001556 0.001208 -0.000363 -0.013585 0.012202 0.002664 -0.006026 0.003687 -0.030219
Protection_agency 0.032814 0.069581 0.028075 0.036276 0.068306 0.068977 NaN -0.014353 -0.014340 NaN ... 0.135823 0.055002 0.032069 0.054253 0.073714 0.057667 0.086431 0.000206 0.004198 0.026403
Federal_lands -0.059000 -0.091272 -0.052258 -0.047595 -0.048299 -0.048102 NaN -0.056846 -0.057087 NaN ... 0.075656 0.039387 0.025605 0.035157 0.014590 0.043083 0.033986 -0.001039 0.030475 -0.025069
Land_Class -0.003016 0.044372 0.055674 0.058401 0.007211 0.008302 NaN -0.031734 -0.031625 NaN ... 0.115639 0.061540 0.030654 0.065291 0.050525 0.074048 0.073065 -0.001377 0.024840 0.053213
Land_Class_tg 0.157565 -0.059960 0.037642 0.045899 0.055574 0.057065 NaN 0.140536 0.140132 NaN ... 0.103653 -0.001977 -0.000250 -0.002734 -0.001286 -0.000082 0.001886 -0.004488 -0.004179 0.036976
Minimum -0.033101 0.042316 0.091073 0.117014 0.326262 0.325246 NaN -0.101031 -0.100726 NaN ... -0.060317 -0.007858 -0.011600 -0.017529 -0.003337 -0.019327 0.000293 -0.005802 -0.005715 0.138474
Dual -0.041756 0.031547 0.159198 0.187814 0.346062 0.345094 NaN -0.106424 -0.106219 NaN ... -0.033875 -0.007613 -0.010464 -0.014816 -0.004722 -0.019048 -0.003160 -0.006050 0.013413 0.174779
County 0.268164 0.199976 0.002904 -0.004643 -0.010720 -0.009931 NaN 0.229764 0.230056 NaN ... 0.038845 0.003358 0.002829 -0.000813 0.015776 0.002500 0.018109 -0.006797 0.023838 -0.006714
FO_Land_Owner -0.092466 -0.076038 0.040559 0.046112 0.051165 0.051341 NaN -0.097165 -0.097397 NaN ... -0.000267 0.024035 0.020590 0.016357 -0.001801 0.023054 0.006199 0.017379 0.013507 0.018109
SurchargeLot 0.003817 0.035827 0.274756 0.307456 0.361303 0.360186 NaN -0.072306 -0.072007 NaN ... -0.027278 -0.006203 -0.007550 -0.018503 0.001143 -0.016075 0.005061 -0.006289 -0.000029 0.212616
SurchargeLotAssessed 0.016206 0.027337 0.150192 0.159111 0.247901 0.247110 NaN -0.035084 -0.034857 NaN ... 0.010971 0.006559 0.011824 0.026464 0.018859 0.042898 0.037888 -0.002809 0.007866 0.089671
SB_360_Lot -0.036957 0.305162 -0.133812 -0.143383 -0.151897 -0.151038 NaN -0.262374 -0.262061 NaN ... 0.008907 0.006421 -0.005042 -0.015476 0.004041 0.000030 0.006110 -0.006837 -0.009516 -0.100448
SB_360_Liable -0.013105 -0.053098 0.064047 0.063442 0.060411 0.065249 NaN -0.029532 -0.028520 NaN ... 0.006749 0.010001 0.030298 0.039566 -0.017327 0.038527 -0.000260 -0.013745 -0.030668 0.062793
Discovered_By -0.026297 0.010418 -0.000592 -0.000420 -0.001773 -0.002144 NaN -0.061890 -0.061912 NaN ... -0.010416 0.000289 -0.002014 -0.002955 -0.002867 -0.001944 -0.001020 -0.001028 -0.001506 0.001609
Caused_by -0.180777 0.010206 0.034016 0.053407 0.094458 0.094952 NaN -0.232965 -0.232800 NaN ... -0.039441 -0.018645 -0.019359 -0.017531 -0.005667 -0.022366 -0.015159 -0.010449 0.015604 0.099491
General -0.249815 0.001324 -0.033927 -0.014261 0.023173 0.021539 NaN -0.307928 -0.307623 NaN ... 0.051790 -0.019316 -0.018939 -0.019199 0.013946 -0.020348 0.006313 -0.011420 0.011680 0.054432
Specific -0.253435 0.005833 -0.038920 -0.020418 0.031330 0.029667 NaN -0.318561 -0.318259 NaN ... 0.054835 -0.021936 -0.023244 -0.026318 0.010508 -0.031812 -0.001647 -0.011893 0.011448 0.049000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Notif_number -0.040257 0.003248 0.138567 0.165521 0.192332 0.191538 NaN -0.063862 -0.063995 NaN ... 0.011822 -0.006775 -0.006475 -0.008141 NaN -0.007303 -0.008921 -0.019120 NaN 0.219439
Walk_In_Delay 0.010980 0.006515 -0.000970 -0.001418 -0.002133 -0.001175 NaN 0.007946 0.007988 NaN ... -0.004820 0.001410 0.000592 0.000508 -0.000354 0.000172 -0.000054 -0.000211 -0.000349 0.005575
DS_Cost 0.006140 0.005339 0.029744 0.037114 0.040191 0.040404 NaN -0.000909 -0.000864 NaN ... 0.103026 0.148176 0.140773 0.158965 0.118909 0.172102 0.225208 0.004031 0.001656 0.037195
ES_Cost 0.004406 -0.005804 0.028985 0.034103 0.036039 0.036378 NaN 0.003722 0.003754 NaN ... 0.203951 0.117127 0.538398 0.208646 0.088508 0.124191 0.110395 0.007333 -0.000435 0.017519
PC_Cost -0.016291 0.009639 -0.012612 -0.010287 -0.003859 -0.003905 NaN -0.021788 -0.021815 NaN ... 0.123426 0.037992 0.058478 0.148666 0.085062 0.143068 0.213207 0.001818 -0.001112 -0.004021
OA_Cost 0.028216 0.010032 0.025436 0.028733 0.028818 0.029104 NaN 0.020714 0.020819 NaN ... 0.166249 0.210716 0.204437 0.222987 0.309615 0.148830 0.249992 0.005906 0.055465 0.011535
TotalCost NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Est_Damage 0.002900 -0.005078 0.011167 0.013746 0.016641 0.016848 NaN -0.000590 -0.000585 NaN ... 0.129803 0.140441 0.361981 0.235642 0.216242 0.225525 0.206857 0.007358 0.010977 0.000985
CostRecovery -0.003576 -0.001158 0.064246 0.078343 0.065573 0.065827 NaN -0.005411 -0.005341 NaN ... 0.115726 0.169630 0.163015 0.313700 0.147682 0.264737 0.210603 0.004544 -0.002319 0.066646
FEMA -0.006543 0.054456 0.077677 0.096058 0.211812 0.211446 NaN -0.056832 -0.056657 NaN ... 0.113937 0.009724 0.012814 0.021550 0.003644 0.020767 0.020403 -0.002090 -0.004512 0.111274
AttackBy -0.016568 0.004565 -0.038658 -0.036805 -0.016919 -0.016847 NaN -0.033194 -0.033177 NaN ... -0.012994 -0.001331 -0.004829 -0.006988 -0.004416 -0.006900 -0.003478 -0.001988 -0.003556 -0.010751
AttackType 0.062004 0.120003 0.043191 0.070952 0.188141 0.188685 NaN -0.073261 -0.072884 NaN ... 0.131670 0.017868 0.022897 0.029022 0.033178 0.049811 0.062438 0.006727 0.000718 0.065722
Flame_length -0.030116 -0.012313 0.024774 0.026842 0.031431 0.033724 NaN -0.041737 -0.041676 NaN ... 0.389546 0.050168 0.085713 0.137628 0.105252 0.149001 0.151168 NaN NaN NaN
Size_at_attack 0.023847 0.009183 -0.004119 -0.002668 -0.001009 -0.000670 NaN 0.017276 0.017314 NaN ... 0.100735 0.100504 0.041833 0.004069 0.008096 0.008147 0.040697 -0.000083 -0.000595 0.002353
Behavior -0.030251 0.011625 -0.033544 -0.037446 -0.042252 -0.042496 NaN -0.006868 -0.007063 NaN ... 0.208912 0.035164 0.046629 0.057725 0.043267 0.052336 0.053623 -0.004518 0.007543 -0.028705
Fuel -0.400156 -0.265852 0.039585 -0.027520 -0.450215 -0.449569 NaN -0.080206 -0.080425 NaN ... -0.031521 -0.010888 -0.013374 -0.013394 0.001123 -0.013492 -0.006558 -0.001690 0.007884 -0.086634
Topography 0.096087 0.068489 -0.091927 -0.085490 0.476654 0.476873 NaN -0.035946 -0.035890 NaN ... -0.029478 0.008816 -0.000125 0.001980 0.016206 -0.003902 0.004359 -0.003955 0.014473 0.093577
Aspect 0.096190 0.068973 -0.091296 -0.084815 0.477251 0.477474 NaN -0.036552 -0.036498 NaN ... -0.029847 0.008798 -0.000153 0.001915 0.016142 -0.003965 0.004287 -0.003966 0.014434 0.094116
Slope 0.096220 0.069228 -0.091175 -0.084710 0.477256 0.477477 NaN -0.036572 -0.036516 NaN ... -0.029783 0.008799 -0.000151 0.001924 0.016142 -0.003965 0.004287 -0.003966 0.014434 0.094346
Elevation 0.095528 0.068704 -0.091183 -0.084958 0.476524 0.476756 NaN -0.037122 -0.037066 NaN ... -0.029993 0.008825 -0.000115 0.001971 0.014600 -0.004041 0.002813 -0.003946 0.014506 0.092412
Size_class 0.041224 0.033210 0.011245 0.014625 -0.001244 -0.000912 NaN 0.021834 0.022103 NaN ... 1.000000 0.179155 0.286786 0.279775 0.155975 0.284261 0.237754 0.005499 0.003354 0.004100
Size_acres 0.014888 0.001582 0.021108 0.022499 0.022978 0.023272 NaN 0.010784 0.010873 NaN ... 0.179155 1.000000 0.328017 0.302674 0.198700 0.415370 0.364108 0.004140 0.006211 0.010768
Size_prot 0.014578 -0.001402 0.020731 0.022293 0.016213 0.016663 NaN 0.018210 0.018298 NaN ... 0.286786 0.328017 1.000000 0.324740 0.220377 0.390251 0.427034 0.011438 0.002197 0.005513
Homes_Saved 0.018966 -0.000656 0.009734 0.010475 0.009203 0.011074 NaN 0.030827 0.031090 NaN ... 0.279775 0.302674 0.324740 1.000000 0.279326 0.724782 0.354939 0.300181 0.010176 0.008187
Homes_Lost 0.013644 -0.013638 0.020665 0.022059 0.021398 0.023118 NaN 0.008614 0.008595 NaN ... 0.155975 0.198700 0.220377 0.279326 1.000000 0.365731 0.829151 0.008015 0.086264 0.013673
Structures_Saved 0.038826 0.011960 0.015485 0.017169 0.015568 0.017463 NaN 0.038056 0.038264 NaN ... 0.284261 0.415370 0.390251 0.724782 0.365731 1.000000 0.483552 0.306562 0.004113 0.009780
Structures_Lost 0.032316 0.002634 0.021889 0.023424 0.021995 0.024769 NaN 0.023616 0.023702 NaN ... 0.237754 0.364108 0.427034 0.354939 0.829151 0.483552 1.000000 0.007105 0.058010 0.010279
Number_of_Injuries -0.003633 -0.005990 -0.003186 -0.003854 -0.003787 -0.003147 NaN -0.002401 -0.002380 NaN ... 0.005499 0.004140 0.011438 0.300181 0.008015 0.306562 0.007105 1.000000 0.000431 -0.005187
Number_of_Deaths -0.007731 0.003807 0.002646 0.002525 0.001735 0.002669 NaN -0.012425 -0.012483 NaN ... 0.003354 0.006211 0.002197 0.010176 0.086264 0.004113 0.058010 0.000431 1.000000 0.012935
CauseType -0.004456 0.012365 0.377476 0.488493 0.389611 0.388431 NaN -0.044790 -0.044747 NaN ... 0.004100 0.010768 0.005513 0.008187 0.013673 0.009780 0.010279 -0.005187 0.012935 1.000000

66 rows × 66 columns

In [ ]:
 
In [9]:
drop_columns = ['OBJECTID', 'Serial'  ,'FireNumber' ,'FiscalYear','FireArea','Current_District', 'Current_Unit','FireName',
               'PreparedBy', 'LandmarkLocation', 'Equipment','NumberPersonnel', 'CreationDate', 'ModifiedBy', 'ModifiedDate',
                'Twn', 'Rng', 'Sec', 'Subdiv', 'County','LO_Name','DiscoveredByName','Specific',
                'Cause_Comments','Lead_Investigator', 'Degree_certain','General_Restriction', 'Industrial_Restriction',
                'RegulatedUseZone','Involve_op', 'Notif_year','Notif_dist', 'Notif_number','Report_DateTime', 'Attack_DateTime',
                'Control_DateTime','Walk_In_Delay', 'DS_Cost', 'ES_Cost', 'PC_Cost', 'OA_Cost','TotalCost', 'Est_Damage', 'CostRecovery',
               'AttackBy','AttackType','Size_class','Size_prot', 'Homes_Saved', 'Homes_Lost','Structures_Saved', 'Structures_Lost',
                'Number_of_Injuries','Number_of_Deaths', 'CauseType']
In [10]:
historical_fires.drop(columns=drop_columns,inplace=True)
In [11]:
use_later = ["Burn_Index","WeatherStationID"]
historical_fires.drop(columns=use_later,inplace=True)
In [12]:
#Convert columns to datetime format
historical_fires['Ign_DateTime']=pd.to_datetime(historical_fires['Ign_DateTime'])
historical_fires['Discover_DateTime']=pd.to_datetime(historical_fires['Discover_DateTime'])
In [13]:
historical_fires['fire_dayofyear'] = historical_fires['Discover_DateTime'].dt.dayofyear
#historical_fires['fire_dayofyear'] = historical_fires['fire_dayofyear'].astype(int)
In [14]:
historical_fires = historical_fires.fillna(0)
In [15]:
#Removing time columns
time_columns = ['Ign_DateTime','Discover_DateTime']
historical_fires.drop(columns=time_columns,inplace=True)
In [18]:
#The target to predict, area burnt by fire.
historical_fires['Size_acres'].describe()
Out[18]:
count     64053.000000
mean         81.130331
std        3343.144286
min           0.000000
25%           0.050000
50%           0.100000
75%           0.500000
max      499945.000000
Name: Size_acres, dtype: float64
In [19]:
#checking for Null Values in the data
historical_fires.isnull().sum()
Out[19]:
X                       0
Y                       0
FireYear                0
FireCategory            0
Longitude               0
Latitude                0
Protection_agency       0
Federal_lands           0
Land_Class              0
Land_Class_tg           0
Minimum                 0
Dual                    0
FO_Land_Owner           0
SurchargeLot            0
SurchargeLotAssessed    0
SB_360_Lot              0
SB_360_Liable           0
Discovered_By           0
Caused_by               0
General                 0
FEMA                    0
Flame_length            0
Size_at_attack          0
Behavior                0
Fuel                    0
Topography              0
Aspect                  0
Slope                   0
Elevation               0
Size_acres              0
fire_dayofyear          0
dtype: int64
In [20]:
import matplotlib.pyplot as plt
import numpy as np
# Normal Distribution of fire size (target)
# Central Limit Theory applies
plt.hist(historical_fires['Size_acres'], bins=np.logspace(-4,6, 50));
plt.ylabel('Number of Fires')
plt.xlabel('Fire Size')
plt.gca().set_yscale("log")
plt.gca().set_xscale("log")
plt.show()
In [23]:
# !pip install plotly-express
import plotly.express as px

fig = px.scatter(historical_fires, x = "Longitude", y = "Latitude", color = "FireYear", opacity=0.2)

fig.show()

0, 0 is Null Island: https://en.wikipedia.org/wiki/Null_Island

Often when working with data that includes latitudes and longitudes, 0,0 is an identifier for potentially erroneous data

In [24]:
# 7200 observations at Null Island
len(historical_fires.query('Longitude==0 & Latitude==0'))
Out[24]:
7200
In [25]:
# 99.99% of latitudes are between ...
np.percentile(historical_fires['Latitude'], 0.01), np.percentile(historical_fires['Latitude'], 99.99)
Out[25]:
(0.0, 46.23157843999999)
In [26]:
# 99.99% of longitudes are between ...
np.percentile(historical_fires['Longitude'], 0.01), np.percentile(historical_fires['Longitude'], 99.99)
Out[26]:
(-124.52756880000001, 0.0)

Remove Outliers

In [27]:
# Remove all the Null Islands
historical_fires =  historical_fires[(historical_fires['Latitude'] >0) &
        (historical_fires['Longitude']<0)]
In [28]:
# Look at the relationship between latitude/longitude and year that the fire occurred in
fig = px.scatter(historical_fires, x = "Longitude", y = "Latitude", color = "FireYear", opacity=0.2)

fig.show()
In [29]:
# Look at the relationship between latitude, longitude, fire year and fire size (in acres)
fig = px.scatter(historical_fires, x = "Longitude", y = "Latitude", color = "FireYear", size="Size_acres", opacity =0.9)

fig.show()
In [30]:
# Overlaying fire latitude/longitude to stamen-terrain mapbox layout
fig = px.scatter_mapbox(historical_fires, lat='Latitude', lon='Longitude', color='FireYear', opacity=0.1)
fig.update_layout(mapbox_style='stamen-terrain')
fig.show()
In [31]:
# Cluster the locations
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10, n_jobs=-1)
historical_fires['cluster'] = kmeans.fit_predict(historical_fires[['Longitude', 'Latitude']])
px.scatter(historical_fires, x='Longitude', y='Latitude', color='cluster')
In [80]:
# Cluster the locations based on GIS acres
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=7, n_jobs=-1)
explore = historical_fires.copy()
explore['cluster'] = kmeans.fit_predict(explore[['Size_acres']])
px.scatter(explore, x='Longitude', y='Latitude', color='cluster')
In [50]:
def engineer_features(df):
    # What's the terrain, based on latitude & longitude?
    clusters = pd.get_dummies(df['cluster'], prefix='cluster')
    for col in clusters:
        df[col] = clusters[col]
        
    # Remove Outlier/invalid values for 'caused_by' column
    df = df[df['Caused_by'] < 15]
    
    #As the standard deviation is very high, removing the outliers from the data
    df = df[df['Size_acres'] < 500]
    
    #Based on pandas-profiling report, we're rejecting features highly-correlated with other criteria
    correlated_columns = ['Slope', 'Aspect', 'Topography', 'X', 'Y']
    df = df.drop(columns=correlated_columns)
    
    # remove unusable variance - constant value
    unusable_variance = ['FireCategory']
    df = df.drop(columns=unusable_variance)
    
    return df
In [51]:
# wrangle dataframe using engineering features function
historical_fires = engineer_features(historical_fires)
In [54]:
#Creating the data for training
Y = historical_fires['Size_acres']
X = historical_fires.drop(columns='Size_acres')
In [55]:
# #Normalising the Data
sc = StandardScaler()
X_norm = sc.fit_transform(X)
In [56]:
#creating the train and test data
X_train,X_test,y_train,y_test = train_test_split(X_norm,Y,test_size = 0.2)
print("# Train: {} , #Test: {}".format(X_train.shape[0], X_test.shape[0]))
print("# inputs: {}".format(X_train.shape[1]))
n = X_train.shape[1]
# Train: 41381 , #Test: 10346
# inputs: 35
In [58]:
cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
ax = sns.scatterplot(x="Longitude", y="Latitude",
                     hue="Caused_by", size="Size_acres",
                     sizes=(20, 200),palette="Set1", data=historical_fires)
In [41]:
import pandas_profiling
pandas_profiling.ProfileReport(X_train)
Out[41]:

Overview

Dataset info

Number of variables 42
Number of observations 41669
Total Missing (%) 0.0%
Total size in memory 10.4 MiB
Average record size in memory 262.0 B

Variables types

Numeric 19
Categorical 0
Boolean 17
Date 0
Text (Unique) 0
Rejected 6
Unsupported 0

Warnings

Variables

Aspect
Highly correlated

This variable is highly correlated with Topography and should be ignored for analysis

Correlation 0.99903

Behavior
Numeric

Distinct count 9
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.0486
Minimum 0
Maximum 9
Zeros (%) 49.9%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 1
Q3 2
95-th percentile 3
Maximum 9
Range 9
Interquartile range 2

Descriptive statistics

Standard deviation 1.4522
Coef of variation 1.3849
Kurtosis 11.138
Mean 1.0486
MAD 1.0608
Skewness 2.6526
Sum 43695
Variance 2.1089
Memory size 325.6 KiB
Value Count Frequency (%)  
0.0 20800 49.9%
 
2.0 11689 28.1%
 
1.0 5973 14.3%
 
3.0 1786 4.3%
 
4.0 682 1.6%
 
9.0 601 1.4%
 
6.0 61 0.1%
 
7.0 49 0.1%
 
5.0 28 0.1%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 20800 49.9%
 
1.0 5973 14.3%
 
2.0 11689 28.1%
 
3.0 1786 4.3%
 
4.0 682 1.6%
 

Maximum 5 values

Value Count Frequency (%)  
4.0 682 1.6%
 
5.0 28 0.1%
 
6.0 61 0.1%
 
7.0 49 0.1%
 
9.0 601 1.4%
 

Caused_by
Numeric

Distinct count 15
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.9731
Minimum 0
Maximum 14
Zeros (%) 0.4%

Quantile statistics

Minimum 0
5-th percentile 1
Q1 1
Median 5
Q3 8
95-th percentile 13
Maximum 14
Range 14
Interquartile range 7

Descriptive statistics

Standard deviation 3.8403
Coef of variation 0.7722
Kurtosis -0.70066
Mean 4.9731
MAD 3.1701
Skewness 0.61847
Sum 207230
Variance 14.748
Memory size 325.6 KiB
Value Count Frequency (%)  
1.0 13559 32.5%
 
5.0 7390 17.7%
 
9.0 5026 12.1%
 
13.0 3293 7.9%
 
7.0 2432 5.8%
 
2.0 2255 5.4%
 
6.0 2070 5.0%
 
3.0 1824 4.4%
 
10.0 1296 3.1%
 
4.0 1100 2.6%
 
Other values (5) 1424 3.4%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 161 0.4%
 
1.0 13559 32.5%
 
2.0 2255 5.4%
 
3.0 1824 4.4%
 
4.0 1100 2.6%
 

Maximum 5 values

Value Count Frequency (%)  
10.0 1296 3.1%
 
11.0 262 0.6%
 
12.0 113 0.3%
 
13.0 3293 7.9%
 
14.0 91 0.2%
 

Discovered_By
Numeric

Distinct count 14
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 8.2852
Minimum 0
Maximum 99
Zeros (%) 0.4%

Quantile statistics

Minimum 0
5-th percentile 1
Q1 3
Median 6
Q3 8
95-th percentile 9
Maximum 99
Range 99
Interquartile range 5

Descriptive statistics

Standard deviation 16.266
Coef of variation 1.9632
Kurtosis 26.498
Mean 8.2852
MAD 5.6874
Skewness 5.2694
Sum 345230
Variance 264.57
Memory size 325.6 KiB
Value Count Frequency (%)  
6.0 12197 29.3%
 
8.0 6086 14.6%
 
1.0 3730 9.0%
 
3.0 3602 8.6%
 
9.0 3175 7.6%
 
7.0 3090 7.4%
 
2.0 2952 7.1%
 
4.0 2917 7.0%
 
5.0 1951 4.7%
 
99.0 1270 3.0%
 
Other values (4) 699 1.7%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 150 0.4%
 
1.0 3730 9.0%
 
2.0 2952 7.1%
 
3.0 3602 8.6%
 
4.0 2917 7.0%
 

Maximum 5 values

Value Count Frequency (%)  
9.0 3175 7.6%
 
10.0 475 1.1%
 
11.0 73 0.2%
 
13.0 1 0.0%
 
99.0 1270 3.0%
 

Dual
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1204
0
36652
1
 
5017
Value Count Frequency (%)  
0 36652 88.0%
 
1 5017 12.0%
 

Elevation
Highly correlated

This variable is highly correlated with Slope and should be ignored for analysis

Correlation 0.99764

FEMA
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.054237
0
39409
1
 
2260
Value Count Frequency (%)  
0 39409 94.6%
 
1 2260 5.4%
 

FO_Land_Owner
Numeric

Distinct count 22
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 18.305
Minimum 0
Maximum 90
Zeros (%) 0.1%

Quantile statistics

Minimum 0
5-th percentile 10
Q1 10
Median 11
Q3 20
95-th percentile 50
Maximum 90
Range 90
Interquartile range 10

Descriptive statistics

Standard deviation 15.955
Coef of variation 0.87165
Kurtosis 5.0835
Mean 18.305
MAD 11.361
Skewness 2.2726
Sum 762740
Variance 254.57
Memory size 325.6 KiB
Value Count Frequency (%)  
10.0 19695 47.3%
 
13.0 4951 11.9%
 
50.0 4444 10.7%
 
11.0 3978 9.5%
 
20.0 2959 7.1%
 
12.0 2047 4.9%
 
30.0 1540 3.7%
 
40.0 734 1.8%
 
90.0 573 1.4%
 
70.0 194 0.5%
 
Other values (12) 554 1.3%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 59 0.1%
 
1.0 172 0.4%
 
2.0 24 0.1%
 
3.0 9 0.0%
 
4.0 3 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
50.0 4444 10.7%
 
60.0 118 0.3%
 
70.0 194 0.5%
 
80.0 9 0.0%
 
90.0 573 1.4%
 

Federal_lands
Numeric

Distinct count 4
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.0473
Minimum 0
Maximum 3
Zeros (%) 8.3%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 1
Median 1
Q3 1
95-th percentile 3
Maximum 3
Range 3
Interquartile range 0

Descriptive statistics

Standard deviation 0.56382
Coef of variation 0.53838
Kurtosis 5.8411
Mean 1.0473
MAD 0.2537
Skewness 1.792
Sum 43638
Variance 0.31789
Memory size 325.6 KiB
Value Count Frequency (%)  
1.0 34978 83.9%
 
0.0 3469 8.3%
 
3.0 2216 5.3%
 
2.0 1006 2.4%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 3469 8.3%
 
1.0 34978 83.9%
 
2.0 1006 2.4%
 
3.0 2216 5.3%
 

Maximum 5 values

Value Count Frequency (%)  
0.0 3469 8.3%
 
1.0 34978 83.9%
 
2.0 1006 2.4%
 
3.0 2216 5.3%
 

FireCategory
Constant

This variable is constant and should be ignored for analysis

Constant value 1

FireYear
Numeric

Distinct count 59
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1992
Minimum 1960
Maximum 2019
Zeros (%) 0.0%

Quantile statistics

Minimum 1960
5-th percentile 1969
Q1 1979
Median 1992
Q3 2005
95-th percentile 2016
Maximum 2019
Range 59
Interquartile range 26

Descriptive statistics

Standard deviation 15.017
Coef of variation 0.0075386
Kurtosis -1.1464
Mean 1992
MAD 12.864
Skewness 0.064528
Sum 83005611
Variance 225.51
Memory size 325.6 KiB
Value Count Frequency (%)  
1987 1249 3.0%
 
1970 1219 2.9%
 
1992 1207 2.9%
 
1994 1043 2.5%
 
1974 1014 2.4%
 
1975 995 2.4%
 
1973 986 2.4%
 
2006 943 2.3%
 
2001 934 2.2%
 
1985 920 2.2%
 
Other values (49) 31159 74.8%
 

Minimum 5 values

Value Count Frequency (%)  
1960 4 0.0%
 
1961 6 0.0%
 
1962 2 0.0%
 
1963 2 0.0%
 
1964 5 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2015 771 1.9%
 
2016 594 1.4%
 
2017 812 1.9%
 
2018 820 2.0%
 
2019 408 1.0%
 

Flame_length
Numeric

Distinct count 7
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.40308
Minimum 0
Maximum 6
Zeros (%) 70.8%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 1
95-th percentile 2
Maximum 6
Range 6
Interquartile range 1

Descriptive statistics

Standard deviation 0.77548
Coef of variation 1.9239
Kurtosis 12.563
Mean 0.40308
MAD 0.57116
Skewness 2.9311
Sum 16796
Variance 0.60137
Memory size 325.6 KiB
Value Count Frequency (%)  
0.0 29522 70.8%
 
1.0 9156 22.0%
 
2.0 2068 5.0%
 
3.0 530 1.3%
 
4.0 183 0.4%
 
6.0 132 0.3%
 
5.0 78 0.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 29522 70.8%
 
1.0 9156 22.0%
 
2.0 2068 5.0%
 
3.0 530 1.3%
 
4.0 183 0.4%
 

Maximum 5 values

Value Count Frequency (%)  
2.0 2068 5.0%
 
3.0 530 1.3%
 
4.0 183 0.4%
 
5.0 78 0.2%
 
6.0 132 0.3%
 

Fuel
Numeric

Distinct count 15
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 15.588
Minimum 0
Maximum 99
Zeros (%) 0.4%

Quantile statistics

Minimum 0
5-th percentile 1
Q1 3
Median 5
Q3 9
95-th percentile 99
Maximum 99
Range 99
Interquartile range 6

Descriptive statistics

Standard deviation 29.31
Coef of variation 1.8803
Kurtosis 4.1519
Mean 15.588
MAD 18.132
Skewness 2.456
Sum 649520
Variance 859.06
Memory size 325.6 KiB
Value Count Frequency (%)  
3.0 7048 16.9%
 
5.0 5433 13.0%
 
1.0 4938 11.9%
 
8.0 4919 11.8%
 
99.0 4529 10.9%
 
2.0 3689 8.9%
 
7.0 2590 6.2%
 
12.0 2432 5.8%
 
9.0 1920 4.6%
 
6.0 1276 3.1%
 
Other values (5) 2895 6.9%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 148 0.4%
 
1.0 4938 11.9%
 
2.0 3689 8.9%
 
3.0 7048 16.9%
 
4.0 214 0.5%
 

Maximum 5 values

Value Count Frequency (%)  
10.0 1255 3.0%
 
11.0 549 1.3%
 
12.0 2432 5.8%
 
13.0 729 1.7%
 
99.0 4529 10.9%
 

General
Numeric

Distinct count 11
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.8775
Minimum 0
Maximum 10
Zeros (%) 0.0%

Quantile statistics

Minimum 0
5-th percentile 1
Q1 1
Median 4
Q3 6
95-th percentile 9
Maximum 10
Range 10
Interquartile range 5

Descriptive statistics

Standard deviation 2.5849
Coef of variation 0.66663
Kurtosis -0.91615
Mean 3.8775
MAD 2.2163
Skewness 0.43665
Sum 161570
Variance 6.6816
Memory size 325.6 KiB
Value Count Frequency (%)  
1.0 13587 32.6%
 
6.0 7819 18.8%
 
3.0 6375 15.3%
 
4.0 4245 10.2%
 
5.0 3292 7.9%
 
9.0 3075 7.4%
 
8.0 1610 3.9%
 
7.0 802 1.9%
 
2.0 784 1.9%
 
10.0 77 0.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 3 0.0%
 
1.0 13587 32.6%
 
2.0 784 1.9%
 
3.0 6375 15.3%
 
4.0 4245 10.2%
 

Maximum 5 values

Value Count Frequency (%)  
6.0 7819 18.8%
 
7.0 802 1.9%
 
8.0 1610 3.9%
 
9.0 3075 7.4%
 
10.0 77 0.2%
 

Land_Class
Numeric

Distinct count 12
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.769
Minimum 0
Maximum 11
Zeros (%) 0.7%

Quantile statistics

Minimum 0
5-th percentile 1
Q1 1
Median 1
Q3 1
95-th percentile 7
Maximum 11
Range 11
Interquartile range 0

Descriptive statistics

Standard deviation 1.8189
Coef of variation 1.0282
Kurtosis 5.0591
Mean 1.769
MAD 1.1956
Skewness 2.473
Sum 73713
Variance 3.3083
Memory size 325.6 KiB
Value Count Frequency (%)  
1.0 31710 76.1%
 
2.0 4072 9.8%
 
7.0 1524 3.7%
 
5.0 1170 2.8%
 
6.0 748 1.8%
 
4.0 747 1.8%
 
8.0 744 1.8%
 
3.0 350 0.8%
 
0.0 296 0.7%
 
9.0 218 0.5%
 
Other values (2) 90 0.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 296 0.7%
 
1.0 31710 76.1%
 
2.0 4072 9.8%
 
3.0 350 0.8%
 
4.0 747 1.8%
 

Maximum 5 values

Value Count Frequency (%)  
7.0 1524 3.7%
 
8.0 744 1.8%
 
9.0 218 0.5%
 
10.0 89 0.2%
 
11.0 1 0.0%
 

Land_Class_tg
Numeric

Distinct count 3
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.57018
Minimum 0
Maximum 2
Zeros (%) 49.9%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 1
Q3 1
95-th percentile 2
Maximum 2
Range 2
Interquartile range 1

Descriptive statistics

Standard deviation 0.61843
Coef of variation 1.0846
Kurtosis -0.57603
Mean 0.57018
MAD 0.56847
Skewness 0.6035
Sum 23759
Variance 0.38245
Memory size 325.6 KiB
Value Count Frequency (%)  
0.0 20772 49.9%
 
1.0 18035 43.3%
 
2.0 2862 6.9%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 20772 49.9%
 
1.0 18035 43.3%
 
2.0 2862 6.9%
 

Maximum 5 values

Value Count Frequency (%)  
0.0 20772 49.9%
 
1.0 18035 43.3%
 
2.0 2862 6.9%
 

Latitude
Highly correlated

This variable is highly correlated with Y and should be ignored for analysis

Correlation 0.99991

Longitude
Highly correlated

This variable is highly correlated with X and should be ignored for analysis

Correlation 0.99964

Minimum
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.12388
0
36507
1
 
5162
Value Count Frequency (%)  
0 36507 87.6%
 
1 5162 12.4%
 

Protection_agency
Numeric

Distinct count 10
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.1153
Minimum 0
Maximum 9
Zeros (%) 0.3%

Quantile statistics

Minimum 0
5-th percentile 1
Q1 1
Median 1
Q3 1
95-th percentile 1
Maximum 9
Range 9
Interquartile range 0

Descriptive statistics

Standard deviation 0.73505
Coef of variation 0.65905
Kurtosis 44.797
Mean 1.1153
MAD 0.23057
Skewness 6.5649
Sum 46474
Variance 0.54029
Memory size 325.6 KiB
Value Count Frequency (%)  
1.0 40286 96.7%
 
6.0 491 1.2%
 
3.0 314 0.8%
 
5.0 157 0.4%
 
0.0 142 0.3%
 
7.0 107 0.3%
 
4.0 91 0.2%
 
2.0 46 0.1%
 
9.0 30 0.1%
 
8.0 5 0.0%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 142 0.3%
 
1.0 40286 96.7%
 
2.0 46 0.1%
 
3.0 314 0.8%
 
4.0 91 0.2%
 

Maximum 5 values

Value Count Frequency (%)  
5.0 157 0.4%
 
6.0 491 1.2%
 
7.0 107 0.3%
 
8.0 5 0.0%
 
9.0 30 0.1%
 

SB_360_Liable
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.0094795
0.0
41274
1.0
 
395
Value Count Frequency (%)  
0.0 41274 99.1%
 
1.0 395 0.9%
 

SB_360_Lot
Numeric

Distinct count 3
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.13478
Minimum 0
Maximum 2
Zeros (%) 92.0%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 0
95-th percentile 2
Maximum 2
Range 2
Interquartile range 0

Descriptive statistics

Standard deviation 0.47641
Coef of variation 3.5348
Kurtosis 10.238
Mean 0.13478
MAD 0.2481
Skewness 3.437
Sum 5616
Variance 0.22696
Memory size 325.6 KiB
Value Count Frequency (%)  
0.0 38352 92.0%
 
2.0 2299 5.5%
 
1.0 1018 2.4%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 38352 92.0%
 
1.0 1018 2.4%
 
2.0 2299 5.5%
 

Maximum 5 values

Value Count Frequency (%)  
0.0 38352 92.0%
 
1.0 1018 2.4%
 
2.0 2299 5.5%
 

Size_at_attack
Numeric

Distinct count 285
Unique (%) 0.7%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.0355
Minimum 0
Maximum 50815
Zeros (%) 51.8%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 0.1
95-th percentile 1.5
Maximum 50815
Range 50815
Interquartile range 0.1

Descriptive statistics

Standard deviation 260.28
Coef of variation 85.748
Kurtosis 34968
Mean 3.0355
MAD 5.6317
Skewness 181.37
Sum 126480
Variance 67748
Memory size 325.6 KiB
Value Count Frequency (%)  
0.0 21569 51.8%
 
0.01 7063 17.0%
 
0.1 4409 10.6%
 
0.25 1893 4.5%
 
0.5 1022 2.5%
 
1.0 793 1.9%
 
0.2 539 1.3%
 
2.0 427 1.0%
 
0.02 409 1.0%
 
1.5 290 0.7%
 
Other values (275) 3255 7.8%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 21569 51.8%
 
0.01 7063 17.0%
 
0.02 409 1.0%
 
0.03 110 0.3%
 
0.04 48 0.1%
 

Maximum 5 values

Value Count Frequency (%)  
3000.0 3 0.0%
 
3723.0 1 0.0%
 
7050.0 1 0.0%
 
11615.0 1 0.0%
 
50815.0 1 0.0%
 

Slope
Highly correlated

This variable is highly correlated with Aspect and should be ignored for analysis

Correlation 0.99941

SurchargeLot
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.10295
0
37379
1
 
4290
Value Count Frequency (%)  
0 37379 89.7%
 
1 4290 10.3%
 

SurchargeLotAssessed
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.066812
0
38885
1
 
2784
Value Count Frequency (%)  
0 38885 93.3%
 
1 2784 6.7%
 

Topography
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.33677
0
27636
1
14033
Value Count Frequency (%)  
0 27636 66.3%
 
1 14033 33.7%
 

X
Numeric

Distinct count 31863
Unique (%) 76.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 881940
Minimum 224480
Maximum 2330100
Zeros (%) 0.0%

Quantile statistics

Minimum 224480
5-th percentile 367460
Q1 545730
Median 705170
Q3 1074300
95-th percentile 1936900
Maximum 2330100
Range 2105600
Interquartile range 528560

Descriptive statistics

Standard deviation 469600
Coef of variation 0.53246
Kurtosis 0.36579
Mean 881940
MAD 375690
Skewness 1.1456
Sum 36749000000
Variance 220520000000
Memory size 325.6 KiB
Value Count Frequency (%)  
526103.824146986 28 0.1%
 
562406.384842515 25 0.1%
 
536428.536745414 24 0.1%
 
571746.639435694 22 0.1%
 
546578.327099741 21 0.1%
 
457387.64238844794 20 0.0%
 
428648.02460630203 19 0.0%
 
545210.855314955 19 0.0%
 
623516.884514436 18 0.0%
 
536614.990157485 17 0.0%
 
Other values (31853) 41456 99.5%
 

Minimum 5 values

Value Count Frequency (%)  
224481.370734915 2 0.0%
 
226531.344816267 1 0.0%
 
228714.39796587802 1 0.0%
 
229226.158792645 1 0.0%
 
231490.077427819 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2270734.09120736 1 0.0%
 
2271832.16240157 1 0.0%
 
2288277.34776902 1 0.0%
 
2293224.94717848 1 0.0%
 
2330062.90485564 1 0.0%
 

Y
Numeric

Distinct count 31863
Unique (%) 76.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 743900
Minimum 54599
Maximum 1656700
Zeros (%) 0.0%

Quantile statistics

Minimum 54599
5-th percentile 155730
Q1 295660
Median 701500
Q3 1144000
95-th percentile 1478800
Maximum 1656700
Range 1602100
Interquartile range 848320

Descriptive statistics

Standard deviation 454330
Coef of variation 0.61075
Kurtosis -1.2942
Mean 743900
MAD 403300
Skewness 0.25417
Sum 30997000000
Variance 206420000000
Memory size 325.6 KiB
Value Count Frequency (%)  
292139.513779521 28 0.1%
 
264508.78083989 25 0.1%
 
286449.714566931 24 0.1%
 
696971.049212605 22 0.1%
 
275555.029855639 21 0.1%
 
161345.489501312 20 0.0%
 
228183.73392388198 19 0.0%
 
1643614.27559055 19 0.0%
 
187974.796259835 18 0.0%
 
297458.22473754 17 0.0%
 
Other values (31853) 41456 99.5%
 

Minimum 5 values

Value Count Frequency (%)  
54598.510826766505 1 0.0%
 
66764.41568242009 1 0.0%
 
90169.05479002 1 0.0%
 
90816.4665354341 1 0.0%
 
90865.3143044561 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
1649814.66699475 1 0.0%
 
1651909.77132545 1 0.0%
 
1652731.50098425 1 0.0%
 
1652765.31102362 3 0.0%
 
1656677.58923885 1 0.0%
 

cluster
Numeric

Distinct count 10
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.125
Minimum 0
Maximum 9
Zeros (%) 11.6%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 2
Median 4
Q3 7
95-th percentile 9
Maximum 9
Range 9
Interquartile range 5

Descriptive statistics

Standard deviation 2.803
Coef of variation 0.6795
Kurtosis -1.065
Mean 4.125
MAD 2.3494
Skewness 0.23585
Sum 171886
Variance 7.8566
Memory size 325.6 KiB
Value Count Frequency (%)  
4 6562 15.7%
 
2 5349 12.8%
 
0 4815 11.6%
 
3 4707 11.3%
 
1 3713 8.9%
 
9 3622 8.7%
 
8 3587 8.6%
 
5 3311 7.9%
 
7 3239 7.8%
 
6 2764 6.6%
 

Minimum 5 values

Value Count Frequency (%)  
0 4815 11.6%
 
1 3713 8.9%
 
2 5349 12.8%
 
3 4707 11.3%
 
4 6562 15.7%
 

Maximum 5 values

Value Count Frequency (%)  
5 3311 7.9%
 
6 2764 6.6%
 
7 3239 7.8%
 
8 3587 8.6%
 
9 3622 8.7%
 

cluster_0
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.11555
0
36854
1
 
4815
Value Count Frequency (%)  
0 36854 88.4%
 
1 4815 11.6%
 

cluster_1
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.089107
0
37956
1
 
3713
Value Count Frequency (%)  
0 37956 91.1%
 
1 3713 8.9%
 

cluster_2
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.12837
0
36320
1
 
5349
Value Count Frequency (%)  
0 36320 87.2%
 
1 5349 12.8%
 

cluster_3
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.11296
0
36962
1
 
4707
Value Count Frequency (%)  
0 36962 88.7%
 
1 4707 11.3%
 

cluster_4
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.15748
0
35107
1
 
6562
Value Count Frequency (%)  
0 35107 84.3%
 
1 6562 15.7%
 

cluster_5
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.07946
0
38358
1
 
3311
Value Count Frequency (%)  
0 38358 92.1%
 
1 3311 7.9%
 

cluster_6
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.066332
0
38905
1
 
2764
Value Count Frequency (%)  
0 38905 93.4%
 
1 2764 6.6%
 

cluster_7
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.077732
0
38430
1
 
3239
Value Count Frequency (%)  
0 38430 92.2%
 
1 3239 7.8%
 

cluster_8
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.086083
0
38082
1
 
3587
Value Count Frequency (%)  
0 38082 91.4%
 
1 3587 8.6%
 

cluster_9
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.086923
0
38047
1
 
3622
Value Count Frequency (%)  
0 38047 91.3%
 
1 3622 8.7%
 

fire_dayofyear
Numeric

Distinct count 362
Unique (%) 0.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 211.73
Minimum 0
Maximum 363
Zeros (%) 0.4%

Quantile statistics

Minimum 0
5-th percentile 129
Q1 186
Median 214
Q3 242
95-th percentile 285
Maximum 363
Range 363
Interquartile range 56

Descriptive statistics

Standard deviation 48.275
Coef of variation 0.22801
Kurtosis 1.8562
Mean 211.73
MAD 36.111
Skewness -0.72874
Sum 8822400
Variance 2330.5
Memory size 325.6 KiB
Value Count Frequency (%)  
207.0 593 1.4%
 
213.0 570 1.4%
 
222.0 546 1.3%
 
221.0 471 1.1%
 
220.0 469 1.1%
 
212.0 469 1.1%
 
217.0 452 1.1%
 
208.0 449 1.1%
 
224.0 445 1.1%
 
219.0 442 1.1%
 
Other values (352) 36763 88.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 162 0.4%
 
2.0 3 0.0%
 
3.0 1 0.0%
 
4.0 1 0.0%
 
5.0 5 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
359.0 3 0.0%
 
360.0 2 0.0%
 
361.0 3 0.0%
 
362.0 2 0.0%
 
363.0 1 0.0%
 

index
Numeric

Distinct count 41669
Unique (%) 100.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 31015
Minimum 1
Maximum 64052
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 2877.4
Q1 14101
Median 28361
Q3 49577
95-th percentile 61217
Maximum 64052
Range 64051
Interquartile range 35476

Descriptive statistics

Standard deviation 19380
Coef of variation 0.62488
Kurtosis -1.3213
Mean 31015
MAD 17010
Skewness 0.13852
Sum 1292347969
Variance 375590000
Memory size 325.6 KiB
Value Count Frequency (%)  
2047 1 0.0%
 
48541 1 0.0%
 
52659 1 0.0%
 
50610 1 0.0%
 
56753 1 0.0%
 
54704 1 0.0%
 
9646 1 0.0%
 
15789 1 0.0%
 
13740 1 0.0%
 
3499 1 0.0%
 
Other values (41659) 41659 100.0%
 

Minimum 5 values

Value Count Frequency (%)  
1 1 0.0%
 
2 1 0.0%
 
3 1 0.0%
 
4 1 0.0%
 
9 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
64046 1 0.0%
 
64048 1 0.0%
 
64049 1 0.0%
 
64050 1 0.0%
 
64052 1 0.0%
 

Correlations

Sample

X Y FireYear FireCategory Longitude Latitude Protection_agency Federal_lands Land_Class Land_Class_tg Minimum Dual FO_Land_Owner SurchargeLot SurchargeLotAssessed SB_360_Lot SB_360_Liable Discovered_By Caused_by General FEMA Flame_length Size_at_attack Behavior Fuel Topography Aspect Slope Elevation fire_dayofyear cluster cluster_0 cluster_1 cluster_2 cluster_3 cluster_4 cluster_5 cluster_6 cluster_7 cluster_8 cluster_9
8262 1.596414e+06 9.844365e+05 1999 1 -119.412 44.4456 1.0 1.0 2.0 2.0 0 0 20.0 0 0 0.0 0.0 2.0 1.0 1.0 0 3.0 5.00 3.0 6.0 1 1 1 1 215.0 1 0 1 0 0 0 0 0 0 0 0
49257 5.106075e+05 2.815337e+05 2004 1 -123.472 42.4824 1.0 1.0 1.0 1.0 0 0 20.0 0 0 0.0 0.0 6.0 13.0 3.0 0 2.0 0.75 3.0 5.0 1 1 1 1 224.0 4 0 0 0 0 1 0 0 0 0 0
45688 7.859479e+05 1.249405e+06 2001 1 -122.541 45.1597 1.0 1.0 1.0 1.0 1 1 13.0 1 0 0.0 0.0 9.0 5.0 6.0 0 1.0 0.00 1.0 1.0 1 1 1 1 222.0 6 0 0 0 0 0 0 1 0 0 0
14671 5.275577e+05 1.803938e+05 1991 1 -123.396 42.2068 1.0 1.0 1.0 0.0 0 0 50.0 0 0 0.0 0.0 1.0 1.0 1.0 0 0.0 0.00 0.0 7.0 0 0 0 0 203.0 4 0 0 0 0 1 0 0 0 0 0
59157 4.586384e+05 1.656392e+05 2014 1 -123.648 42.1594 1.0 1.0 5.0 1.0 0 0 20.0 0 0 0.0 0.0 6.0 5.0 5.0 0 0.0 0.01 2.0 8.0 1 1 1 1 246.0 4 0 0 0 0 1 0 0 0 0 0
In [60]:
# Calculating Baseline Prediction errors
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
# Base prediction is mean of the y_train
y_base = [np.mean(y_train)]*y_test.shape[0]
In [61]:
# Mean Errors for Baseline Prediction 
prediction_errors = {}
prediction_errors['Base'] = [MAE(y_test,y_base),MSE(y_test,y_base)]
# prediction_errors['Base MSE'] = MAE(y_test,y_base)
print(f'Mean absolute error for baseline prediction is {MAE(y_test,y_base)}')
print(f'Mean standard error for baseline prediction is {MSE(y_test,y_base)}')
Mean absolute error for base prediction is 6.340975638593508
Mean standard error for base prediction is 611.4065088511755
In [62]:
# Using Logistic Regression to train the model
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
Out[62]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [63]:
# MAE and MSE using Linear Regression
y_pred = lr.predict(X_test)
# Errors for Linear Prediction 
prediction_errors['Linear Regression'] = [MAE(y_test,y_pred),MSE(y_test,y_pred)]
print(f'Mean absolute error for Linear Regression is {MAE(y_test,y_pred)}')
print(f'Mean standard error for Linear Regression is {MSE(y_test,y_pred)}')
Mean absolute error for Linear Regression is 5.613454464205515
Mean standard error for Linear Regression is 497.7229091110607
In [64]:
# Using RandomForest to train the model
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
Out[64]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
In [65]:
# MAE and MSE using Random Forest Regression
y_pred = rf.predict(X_test)
# Mean Errors for Random Forest Prediction 
prediction_errors['Random Forest'] = [MAE(y_test,y_pred),MSE(y_test,y_pred)]
print(f'Mean absolute error for Linear Regression is {MAE(y_test,y_pred)}')
print(f'Mean standard error for Linear Regression is {MSE(y_test,y_pred)}')
Mean absolute error for Linear Regression is 4.920883348297547
Mean standard error for Linear Regression is 489.64086958981477
In [66]:
# Training the data using Ridge Regression
from sklearn.linear_model import Ridge
rr = Ridge(alpha=1.0)
rr.fit(X_train, y_train)
Out[66]:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)
In [68]:
# MAE and MSE using Ridge
y_pred = rr.predict(X_test)
# Mean Errors for Ridge Prediction 
prediction_errors['Ridge Regression'] = [MAE(y_test,y_pred),MSE(y_test,y_pred)]
print(f'Mean absolute error for Ridge is {MAE(y_test,y_pred)}')
print(f'Mean standard error for Ridge is {MSE(y_test,y_pred)}')
Mean absolute error for Ridge is 5.619662768105583
Mean standard error for Ridge is 497.723412149486
In [71]:
# Building a Neural network to predict the area burned by fire in Oregon
model = Sequential()
model.add(Dense(35, input_dim=n, kernel_initializer='normal', activation='relu'))
model.add(Dense(70, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(56, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(28, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(14, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='linear'))
model.summary()
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_20 (Dense)             (None, 35)                1260      
_________________________________________________________________
dense_21 (Dense)             (None, 70)                2520      
_________________________________________________________________
dropout_12 (Dropout)         (None, 70)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 128)               9088      
_________________________________________________________________
dropout_13 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 256)               33024     
_________________________________________________________________
dropout_14 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_24 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_15 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 56)                7224      
_________________________________________________________________
dropout_16 (Dropout)         (None, 56)                0         
_________________________________________________________________
dense_26 (Dense)             (None, 28)                1596      
_________________________________________________________________
dropout_17 (Dropout)         (None, 28)                0         
_________________________________________________________________
dense_27 (Dense)             (None, 14)                406       
_________________________________________________________________
dense_28 (Dense)             (None, 8)                 120       
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 9         
=================================================================
Total params: 88,143
Trainable params: 88,143
Non-trainable params: 0
_________________________________________________________________
In [72]:
model.compile(loss='mae', optimizer='adam', metrics=['mse','mae'])
In [73]:
history = model.fit(X_train, y_train.values, epochs=20, batch_size=1028, verbose = 2, validation_split = 0.05, shuffle=True)
Train on 39311 samples, validate on 2070 samples
Epoch 1/20
39311/39311 - 5s - loss: 3.8166 - mse: 605.4470 - mae: 3.8166 - val_loss: 2.8327 - val_mse: 284.7296 - val_mae: 2.8327
Epoch 2/20
39311/39311 - 2s - loss: 3.6482 - mse: 561.8547 - mae: 3.6482 - val_loss: 2.5907 - val_mse: 243.4479 - val_mae: 2.5907
Epoch 3/20
39311/39311 - 2s - loss: 3.3818 - mse: 527.8478 - mae: 3.3818 - val_loss: 2.3467 - val_mse: 233.6139 - val_mae: 2.3467
Epoch 4/20
39311/39311 - 2s - loss: 3.2943 - mse: 495.7027 - mae: 3.2943 - val_loss: 2.3197 - val_mse: 232.6433 - val_mae: 2.3197
Epoch 5/20
39311/39311 - 2s - loss: 3.2833 - mse: 490.7452 - mae: 3.2833 - val_loss: 2.3384 - val_mse: 233.3045 - val_mae: 2.3384
Epoch 6/20
39311/39311 - 2s - loss: 3.2603 - mse: 490.4917 - mae: 3.2603 - val_loss: 2.2713 - val_mse: 233.0452 - val_mae: 2.2713
Epoch 7/20
39311/39311 - 2s - loss: 3.2743 - mse: 499.8663 - mae: 3.2743 - val_loss: 2.3355 - val_mse: 239.2533 - val_mae: 2.3355
Epoch 8/20
39311/39311 - 3s - loss: 3.2850 - mse: 498.4460 - mae: 3.2850 - val_loss: 2.2677 - val_mse: 233.7367 - val_mae: 2.2677
Epoch 9/20
39311/39311 - 2s - loss: 3.2476 - mse: 495.4905 - mae: 3.2476 - val_loss: 2.2567 - val_mse: 234.5774 - val_mae: 2.2567
Epoch 10/20
39311/39311 - 2s - loss: 3.2536 - mse: 500.7908 - mae: 3.2536 - val_loss: 2.2573 - val_mse: 233.0869 - val_mae: 2.2573
Epoch 11/20
39311/39311 - 3s - loss: 3.2296 - mse: 489.6448 - mae: 3.2296 - val_loss: 2.2848 - val_mse: 231.4359 - val_mae: 2.2848
Epoch 12/20
39311/39311 - 3s - loss: 3.2342 - mse: 493.7725 - mae: 3.2342 - val_loss: 2.2386 - val_mse: 232.5520 - val_mae: 2.2386
Epoch 13/20
39311/39311 - 2s - loss: 3.2256 - mse: 490.4494 - mae: 3.2256 - val_loss: 2.2447 - val_mse: 234.2167 - val_mae: 2.2447
Epoch 14/20
39311/39311 - 2s - loss: 3.2158 - mse: 488.7504 - mae: 3.2158 - val_loss: 2.2661 - val_mse: 235.6802 - val_mae: 2.2661
Epoch 15/20
39311/39311 - 2s - loss: 3.1984 - mse: 485.0881 - mae: 3.1984 - val_loss: 2.2331 - val_mse: 231.9371 - val_mae: 2.2331
Epoch 16/20
39311/39311 - 2s - loss: 3.2124 - mse: 489.5925 - mae: 3.2124 - val_loss: 2.2516 - val_mse: 235.3599 - val_mae: 2.2516
Epoch 17/20
39311/39311 - 2s - loss: 3.2108 - mse: 489.8666 - mae: 3.2108 - val_loss: 2.2273 - val_mse: 232.0704 - val_mae: 2.2273
Epoch 18/20
39311/39311 - 2s - loss: 3.1988 - mse: 486.9035 - mae: 3.1988 - val_loss: 2.2414 - val_mse: 231.4764 - val_mae: 2.2414
Epoch 19/20
39311/39311 - 2s - loss: 3.1724 - mse: 474.0025 - mae: 3.1724 - val_loss: 2.2823 - val_mse: 235.5647 - val_mae: 2.2823
Epoch 20/20
39311/39311 - 2s - loss: 3.1886 - mse: 480.1780 - mae: 3.1886 - val_loss: 2.2775 - val_mse: 232.9865 - val_mae: 2.2775
In [74]:
# MAE and MSE using Neural Networks
y_pred = model.predict(X_test)
# Mean Errors for Neural Networks
prediction_errors['Neural Networks'] = [MAE(y_test,y_pred),MSE(y_test,y_pred)]
print(f'Mean absolute error for NN is {MAE(y_test,y_pred)}')
print(f'Mean standard error for NN is {MSE(y_test,y_pred)}')
Mean absolute error for NN is 3.128617542037161
Mean standard error for NN is 509.8695952644622
In [75]:
# Plot training & validation loss values for Neural Networks
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss: Mean Absolue Error')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper right')
plt.show()
Out[75]:
<matplotlib.legend.Legend at 0x1a3cd366a0>
In [77]:
# Summary of errors for all models used
prediction_errors = pd.DataFrame(prediction_errors)
prediction_errors.index = ['Mean Absoluter Error', 'Mean Standard Error']
prediction_errors.T
Out[77]:
Mean Absoluter Error Mean Standard Error
Base 6.340976 611.406509
Linear Regression 5.613454 497.722909
Random Forest 4.920883 489.640870
Ridge Regression 5.619663 497.723412
Neural Networks 3.128618 509.869595